
# ******************************************************************************
# Copyright 2017-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************

// Finish path for the threads that fall through to this point (no label).
// They scale the clx*y* accumulators by alpha and write them out with STS.128
// in four passes, pairing rows (y0,y2), (y1,y3), (y4,y6), (y5,y7).
// These threads never call OUTPUT_TRANSFORM; they execute seven BAR.SYNC 0,
// the same number as the COMPUTE_FINISH path further down.
--:-:1:-:2      S2R Tid, SR_TID.X;
<SCHEDULE_BLOCK>
--:-:-:-:1      MOV alpha16, param_alpha;

// Tid32_2 = (tid & -32) >> 2
01:-:-:-:1      LOP.AND  Tid32_2,  Tid,    -32;
--:-:-:-:1      SHR.U32  Tid32_2,  Tid32_2, 2;

// readFs = ((tid & 16) >> 3) | (tid & 1)
--:-:-:-:1      LOP.AND Tid1,   Tid,    1;
01:-:-:-:1      LOP.AND readFs, Tid,    16;
--:-:-:-:1      SHR.U32 readFs, readFs, 3;
--:-:-:-:1      IADD    readFs, readFs, Tid1;

// readIs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readFs << 2)
// The last term is combined with ISCADD (shift-and-add), not an OR: the BFE
// field occupies bits 0-2 and readFs << 2 occupies bits 2-3, so the add can carry.
// NOTE(review): the '|' in the formula above looks like shorthand - confirm.
--:-:-:-:1      BFE.U32 readIs, Tid,    0x301; // 3 bits at position 1
--:-:-:-:1      LOP.OR  readIs, readIs, Tid32_2;
--:-:-:-:1      ISCADD  readIs, readFs, readIs, 2;

// Scale before forming the store address: readIs *= 16, readFs *= 8.
--:-:-:-:1      SHL readIs, readIs, 4;
--:-:-:-:1      SHL readFs, readFs, 3;

// writeCs = readFs * 32*36 + readIs;
--:-:-:-:1      XMAD write16Cs, readFs, 1x<32*36>, readIs;
</SCHEDULE_BLOCK>

// Pass 0: rows y0 and y2, stored straight away and followed by one barrier.
--:-:-:-:1      FMUL shuffle16_x0y0, clx0y0, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y0, clx1y0, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y0, clx2y0, alpha16;
--:-:-:-:1      FMUL shuffle16_x3y0, clx3y0, alpha16;
--:-:-:-:1      FMUL shuffle16_x0y1, clx0y2, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y1, clx1y2, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y1, clx2y2, alpha16;
--:-:-:-:0      FMUL shuffle16_x3y1, clx3y2, alpha16;
--:-:-:-:4      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
--:-:-:-:5      BAR.SYNC 0;

// Pass 1: rows y1 and y3. From here on each pass is: scale, barrier, store, barrier.
// Presumably the first barrier lets the readers of the previous pass finish
// before the same addresses are overwritten - confirm against the reader side.
--:-:-:-:1      FMUL shuffle16_x0y0, clx0y1, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y0, clx1y1, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y0, clx2y1, alpha16;
--:-:-:-:1      FMUL shuffle16_x3y0, clx3y1, alpha16;
--:-:-:-:1      FMUL shuffle16_x0y1, clx0y3, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y1, clx1y3, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y1, clx2y3, alpha16;
--:-:-:-:0      FMUL shuffle16_x3y1, clx3y3, alpha16;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
--:-:-:-:5      BAR.SYNC 0;

// Pass 2: rows y4 and y6.
--:-:-:-:1      FMUL shuffle16_x0y0, clx0y4, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y0, clx1y4, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y0, clx2y4, alpha16;
--:-:-:-:1      FMUL shuffle16_x3y0, clx3y4, alpha16;
--:-:-:-:1      FMUL shuffle16_x0y1, clx0y6, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y1, clx1y6, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y1, clx2y6, alpha16;
--:-:-:-:0      FMUL shuffle16_x3y1, clx3y6, alpha16;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
--:-:-:-:5      BAR.SYNC 0;

// Pass 3: rows y5 and y7.
--:-:-:-:1      FMUL shuffle16_x0y0, clx0y5, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y0, clx1y5, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y0, clx2y5, alpha16;
--:-:-:-:1      FMUL shuffle16_x3y0, clx3y5, alpha16;
--:-:-:-:1      FMUL shuffle16_x0y1, clx0y7, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y1, clx1y7, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y1, clx2y7, alpha16;
--:-:-:-:0      FMUL shuffle16_x3y1, clx3y7, alpha16;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
--:-:-:-:5      BAR.SYNC 0;

// All four passes written; these threads are done.
--:-:-:-:5      EXIT;

COMPUTE_FINISH:
// Entry of the second finish path. Computes tid_128 = tid.x - 128, the
// predicate P4 (tid_128 >= 256) and the store address writeCs used by the
// STS.128 passes that follow. P4 threads later branch around the index setup
// and every CAL OUTPUT_TRANSFORM, but still store and hit every barrier.

--:-:1:-:2      S2R tid_128, SR_TID.X;
<SCHEDULE_BLOCK>

--:-:-:-:1      MOV alpha, param_alpha;

// Rebase the thread id so this path counts from zero.
01:-:-:-:1      IADD tid_128, tid_128, -128;

--:-:-:-:1      ISETP.GE.AND P4, PT, tid_128, 256, PT;

// readFs = ((tid &  8) >> 2) | (tid & 1)
--:-:-:-:1      LOP.AND  Tid_1,   tid_128, 1;
--:-:-:-:1      LOP.AND  readFs2, tid_128, 8;
--:-:-:-:1      SHR.U32  readFs2, readFs2, 2;
--:-:-:-:1      IADD     readFs2, readFs2, Tid_1;

// readIs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readFs << 2)
// The last term is combined with ISCADD (shift-and-add), not an OR:
// (tid & -16) >> 1 starts at bit 3 and readFs << 2 occupies bits 2-3, so the
// add can carry. NOTE(review): the '|' above looks like shorthand - confirm.
--:-:-:-:1      LOP.AND  tid_16,   tid_128, -16;
--:-:-:-:1      SHR.U32  tid_16,   tid_16,   1;
--:-:-:-:1      BFE.U32  readIs2,  tid_128,  0x201; // 2 bits at position 1
--:-:-:-:1      LOP.OR   readIs2,  readIs2,  tid_16;
--:-:-:-:1      ISCADD   readIs2,  readFs2, readIs2, 2;

// readIs2 = readIs2 * 16 + 4 * (32*4); readFs2 *= 8.
--:-:-:-:1      ISCADD   readIs2, readIs2, 4x<32*4>, 4;
--:-:-:-:1      SHL      readFs2, readFs2, 3;

// writeCs = readFs * 32*36 + readIs;
--:-:-:-:0      XMAD writeCs, readFs2, 1x<32*36>, readIs2;
</SCHEDULE_BLOCK>

// Output index and predicate setup, skipped by P4 threads.
// Produces: readCs (LDS base used by OUTPUT_TRANSFORM), n, q, p, k, offsetO,
// the 16-bit mask preds, the constant one = 1.0, P6 = (tid_31 == 0), and,
// when the bsum option is set, bsum_offset.
--:-:-:-:5  @P4 BRA.U SKIP0;

--:-:2:-:1      LDS idxX, [addr_idx_X];
--:-:3:-:1      LDS idxY, [addr_idx_Y];
--:-:1:-:1      S2R idxN,  SR_CTAID.Z;
--:-:4:-:1      LDS idxK, [addr_idx_K];
<SCHEDULE_BLOCK>

// tid_31 = tid & 31, tid_32 = tid >> 5, tid_64 = tid >> 6
--:-:-:-:1      LOP.AND tid_31, tid_128, 31;
--:-:-:-:1      SHR.U32 tid_32, tid_128,  5;
--:-:-:-:1      SHR.U32 tid_64, tid_128,  6;

[+
    # Emitted only when $bsum is set:
    # bsum_offset = idxY * param_gridQN + idxX * param_gridN + idxN
    our $bsum; return $bsum ? q{
03:-:-:-:1      XMAD      bsum_offset, idxX, param_gridN,   idxN;
04:-:-:-:1      XMAD.LO2C bsum_offset, idxY, param_gridQN,  bsum_offset;
    } : '';
+]

// Constant 1.0, used by the bprelu code in OUTPUT_TRANSFORM.
--:-:-:-:1      MOV32I one, 1.0;

// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16
// (then multiplied by 4 with the SHL below)
--:-:-:-:1      XMAD   readCs, tid_32, 1x<32*36>, tid_31;
--:-:-:-:1      ISCADD readCs, tid_64, readCs, 4;
--:-:-:-:1      SHL    readCs, readCs, 2;

// n = idxN*32 + (tid_31 & maskN)
--:-:-:-:1      LOP.AND n, tid_31, param_maskN;
01:-:-:-:1      ISCADD  n, idxN, n, 5;

// Superblock offset
// idxX <<= shiftX
// idxY <<= shiftY
02:-:-:-:1      SHL idxX, idxX, param_shiftX;
04:-:-:-:1      SHL idxY, idxY, param_shiftY;

// Get this threads offset within the superblock
// q = BFE(tid_31, superX) * 4 + idxX, p = BFE(tid_31, superY) * 4 + idxY
--:-:-:-:1      BFE.U32 q, tid_31, param_superX;
--:-:-:-:1      BFE.U32 p, tid_31, param_superY;
--:-:-:-:1      ISCADD q, q, idxX, 2;
--:-:-:-:1      ISCADD p, p, idxY, 2;

// k = idxK*32 + tid_32<<1
--:-:-:-:1      SHL tid_32, tid_32,   1;
08:-:-:-:1      ISCADD k, idxK, tid_32, 5;

// Out = k*PQN + p*QN + q*N + n
--:-:-:-:1      XMAD      offsetO, q, param_N,    n;
--:-:-:-:1      XMAD.LO2C offsetO, p, param_QN,   offsetO;
--:-:-:-:1      XMAD.LO2C offsetO, k, param_PQN,  offsetO;

// Bounds predicates for q, q+1, q+2, q+3: 0 <= value < Q, gated by P5.
--:-:-:-:1      IADD z1, q, 1;
--:-:-:-:1      IADD z2, q, 2;
--:-:-:-:1      IADD z3, q, 3;

// P5 = (param_flags == 0); P6 = (n < N)
--:-:-:-:1      ISETP.EQ.AND P5, PT, RZ, param_flags, PT; // ! no-op
--:-:-:-:1      ISETP.LT.AND P6, PT, n, param_N, PT;

--:-:-:-:1      ISETP.LT.AND P0, PT, q,  param_Q, P5;
--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_Q, P5;
--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_Q, P5;
--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_Q, P5;
--:-:-:-:1      ISETP.GE.AND P0, PT, q,  RZ, P0;
--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;
// Save the four q predicates as a 4-bit mask.
--:-:-:-:1      P2R mask_q, PR, RZ, 0x0f;

// Same bounds test for p, p+1, p+2, p+3 against P, gated by P6 (n < N).
--:-:-:-:1      IADD z1, p, 1;
--:-:-:-:1      IADD z2, p, 2;
--:-:-:-:1      IADD z3, p, 3;
--:-:-:-:1      ISETP.LT.AND P0, PT, p,  param_P, P6;
--:-:-:-:1      ISETP.LT.AND P1, PT, z1, param_P, P6;
--:-:-:-:1      ISETP.LT.AND P2, PT, z2, param_P, P6;
--:-:-:-:1      ISETP.LT.AND P3, PT, z3, param_P, P6;
--:-:-:-:1      ISETP.GE.AND P0, PT, p,  RZ, P0;
--:-:-:-:1      ISETP.GE.AND P1, PT, z1, RZ, P1;
--:-:-:-:1      ISETP.GE.AND P2, PT, z2, RZ, P2;
--:-:-:-:1      ISETP.GE.AND P3, PT, z3, RZ, P3;

// preds: nibble i (bits 4i..4i+3) holds mask_q when p+i is in range, else 0.
--:-:-:-:1      SEL preds, mask_q, RZ, P0;
--:-:-:-:1  @P1 BFI preds, mask_q, 0x404, preds;
--:-:-:-:1  @P2 BFI preds, mask_q, 0x408, preds;
--:-:-:-:1  @P3 BFI preds, mask_q, 0x40c, preds;

// P6 is reused from here on: true only when tid_31 == 0.
--:-:-:-:1      ISETP.EQ.AND P6, PT, tid_31, RZ, PT;
</SCHEDULE_BLOCK>

SKIP0:

// Pass 0: scale accumulator rows y0 and y2 by alpha, store them with four
// STS.128 at writeCs, synchronize, then (non-P4 threads) run OUTPUT_TRANSFORM.
<SCHEDULE_BLOCK>
--:-:-:-:1      FMUL shuffle_x0y0, ccx0y0, alpha;
--:-:-:-:1      FMUL shuffle_x1y0, ccx1y0, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, ccx2y0, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, ccx3y0, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, ccx4y0, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, ccx5y0, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, ccx6y0, alpha;
--:-:-:-:1      FMUL shuffle_x7y0, ccx7y0, alpha;
--:-:-:-:1      FMUL shuffle_x0y1, ccx0y2, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, ccx1y2, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, ccx2y2, alpha;
--:-:-:-:1      FMUL shuffle_x3y1, ccx3y2, alpha;
--:-:-:-:1      FMUL shuffle_x4y1, ccx4y2, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, ccx5y2, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, ccx6y2, alpha;
--:-:-:-:1      FMUL shuffle_x7y1, ccx7y2, alpha;

--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
--:-:-:-:1      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
</SCHEDULE_BLOCK>
--:-:-:-:5      BAR.SYNC 0;

// P4 threads skip the call and the k / offsetO update.
--:-:-:-:5  @P4 BRA.U SKIP1;
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
// Advance to the next output channel for pass 1: k += 1, offsetO += PQN.
--:-:-:-:1      IADD k, k, 1;
--:-:-:-:1      IADD offsetO, offsetO, param_PQN;

SKIP1:

// Pass 1: rows y1 and y3. The barrier right after the first FMUL precedes
// all stores of this pass; the second barrier follows them.
// Presumably the first barrier waits for the previous pass's LDS readers
// before the same addresses are overwritten - confirm.
--:-:-:-:0      FMUL shuffle_x0y0, ccx0y1, alpha;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      FMUL shuffle_x1y0, ccx1y1, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, ccx2y1, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, ccx3y1, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, ccx4y1, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, ccx5y1, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, ccx6y1, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, ccx7y1, alpha;
// Stores are interleaved with the remaining multiplies.
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, ccx0y3, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, ccx1y3, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, ccx2y3, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, ccx3y3, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, ccx4y3, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, ccx5y3, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, ccx6y3, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, ccx7y3, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
--:-:-:-:5      BAR.SYNC 0;

// P4 threads skip the call and the k / offsetO update.
--:-:-:-:5  @P4 BRA.U SKIP2;
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
// Jump to the second group of channels for pass 2: k += 15, offsetO += PQN15
// (k is now 16 above its initial value).
--:-:-:-:1      IADD k, k, 15;
--:-:-:-:1      IADD offsetO, offsetO, param_PQN15;

SKIP2:

// Pass 2: rows y4 and y6. Same shape as pass 1: one barrier before the
// stores of this pass, one after them.
--:-:-:-:0      FMUL shuffle_x0y0, ccx0y4, alpha;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      FMUL shuffle_x1y0, ccx1y4, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, ccx2y4, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, ccx3y4, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, ccx4y4, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, ccx5y4, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, ccx6y4, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, ccx7y4, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, ccx0y6, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, ccx1y6, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, ccx2y6, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, ccx3y6, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, ccx4y6, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, ccx5y6, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, ccx6y6, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, ccx7y6, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
--:-:-:-:5      BAR.SYNC 0;

// P4 threads skip the call and the k / offsetO update.
--:-:-:-:5  @P4 BRA.U SKIP3;
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
// Advance to the next output channel for pass 3: k += 1, offsetO += PQN.
--:-:-:-:1      IADD k, k, 1;
--:-:-:-:1      IADD offsetO, offsetO, param_PQN;

SKIP3:

// Pass 3 (last): rows y5 and y7. Same shape as passes 1 and 2; no k or
// offsetO update follows because nothing is produced after this pass.
--:-:-:-:0      FMUL shuffle_x0y0, ccx0y5, alpha;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      FMUL shuffle_x1y0, ccx1y5, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, ccx2y5, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, ccx3y5, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, ccx4y5, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, ccx5y5, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, ccx6y5, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, ccx7y5, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, ccx0y7, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, ccx1y7, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, ccx2y7, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, ccx3y7, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, ccx4y7, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, ccx5y7, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, ccx6y7, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, ccx7y7, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
--:-:-:-:5      BAR.SYNC 0;

// P4 threads skip the final call; every thread then exits.
--:-:-:-:5  @P4 BRA.U SKIP4;
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
SKIP4:

--:-:-:-:5      EXIT;

OUTPUT_TRANSFORM:
// Subroutine called four times from the COMPUTE_FINISH path (once per pass).
// Uses k, offsetO, readCs and preds set up by the caller. It loads a 6x6
// tile m[j][i] with LDS, reduces it to a 4x4 tile s[i][j] in two stages,
// applies the optional epilogue steps and stores the result. Ends with RET.
// This first part sets P5 = (k < K) and, when the bias option is set, loads
// the per-k bias value (zero when k is out of range).

<SCHEDULE_BLOCK>
11:-:-:-:1      ISETP.LT.AND P5, PT, k, param_K, PT;
[+
    # Emitted only when $bias is set: Sum = param_S + k*4, then
    # bias = P5 ? load from Sum : 0.
    our $bias;
    return $bias ? q{
--:-:-:-:1      LEA      Sum0.CC, k, param_S[0],     2;
--:-:-:-:1      LEA.HI.X Sum1,    k, param_S[1], RZ, 2;

--:-:-:-:1 @!P5 MOV bias, RZ;
--:-:5:-:1  @P5 LDG.E.CI bias, [Sum];
    } : '';
+]
</SCHEDULE_BLOCK>

// Generated: the 18 LDS instructions for tile columns 0..2.
[+
    # For each column i in 0..2 and row j in 0..5 emit
    #   LDS m<j><i>, readCs + 4 * ((j*6 + i) * 32)
    # with the third control field set to i+1, so each column gets its own
    # barrier number.
    my $out;
    foreach my $i (0 .. 2)
    {
        foreach my $j (0 .. 5)
        {
            my $b = $i + 1;
            $out .= "--:-:$b:-:1      LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n";
        }
    }
    return $out;
+]

// Generated: first transform stage for columns 0..2, followed in the same
// schedule block by the LDS instructions for columns 3..5.
<SCHEDULE_BLOCK>
[+
    # For column i the six inputs m0i..m5i are reduced to four values:
    #   t0 = m1 + m2, t1 = m1 - m2, t2 = m3 - m4, t3 = m3 + m4
    #   w0 = t0 + m0 + t3
    #   w1 = t1 * trans2 + t2 * 1.5
    #   w2 = t0 * trans3 + t3 * 2.25
    #   w3 = t1 * trans1 + t2 * 3.375 + m5
    # The first instruction of each column carries the wait mask 1 << i,
    # matching the barrier number i+1 used by that column's LDS.
    # $trans1..$trans3 are package variables defined outside this block.
    my $out; our ($trans1, $trans2, $trans3);
    foreach my $i (0 .. 2)
    {
        my $w = sprintf "%02x", 1 << $i;
        $out .= qq{
<ORDERED>
$w:-:-:-:1      FADD t0$i, m1$i,  m2$i;
--:-:-:-:1      FADD t1$i, m1$i, -m2$i;
--:-:-:-:1      FADD t2$i, m3$i, -m4$i;
--:-:-:-:1      FADD t3$i, m3$i,  m4$i;
--:-:-:-:1      FADD w0$i, t0$i,  m0$i;
--:-:-:-:1      FMUL32I w3$i, t1$i, $trans1;
--:-:-:-:1      FMUL32I w1$i, t1$i, $trans2;
--:-:-:-:1      FMUL32I temp, t0$i, $trans3;
--:-:-:-:1      FFMA w3$i, t2$i,  3.375, w3$i;
--:-:-:-:1      FFMA w1$i, t2$i,  1.500, w1$i;
--:-:-:-:1      FFMA w2$i, t3$i,  2.250, temp;
--:-:-:-:1      FADD w0$i, w0$i,  t3$i;
--:-:-:-:1      FADD w3$i, w3$i,  m5$i;
</ORDERED>
        };
    }
    # LDS for columns 3..5, same addressing as columns 0..2; barrier numbers 4..6.
    foreach my $i (3 .. 5)
    {
        foreach my $j (0 .. 5)
        {
            my $b = $i + 1;
            $out .= "--:-:$b:-:1      LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n";
        }
    }
    return $out;
+]
</SCHEDULE_BLOCK>

// Generated: first transform stage for columns 3..5, then the first load of
// the store predicates from preds.
<SCHEDULE_BLOCK>
[+
    # Same arithmetic as for columns 0..2 (m0i..m5i reduced to w0i..w3i);
    # wait masks are 1 << i for i in 3..5, matching barrier numbers 4..6.
    my $out; our ($trans1, $trans2, $trans3);

    foreach my $i (3 .. 5)
    {
        my $w = sprintf "%02x", 1 << $i;
        $out .= qq{
<ORDERED>
$w:-:-:-:1      FADD t0$i, m1$i,  m2$i;
--:-:-:-:1      FADD t1$i, m1$i, -m2$i;
--:-:-:-:1      FADD t2$i, m3$i, -m4$i;
--:-:-:-:1      FADD t3$i, m3$i,  m4$i;
--:-:-:-:1      FADD w0$i, t0$i,  m0$i;
--:-:-:-:1      FMUL32I w3$i, t1$i, $trans1;
--:-:-:-:1      FMUL32I w1$i, t1$i, $trans2;
--:-:-:-:1      FMUL32I temp, t0$i, $trans3;
--:-:-:-:1      FFMA w3$i, t2$i,  3.375, w3$i;
--:-:-:-:1      FFMA w1$i, t2$i,  1.500, w1$i;
--:-:-:-:1      FFMA w2$i, t3$i,  2.250, temp;
--:-:-:-:1      FADD w0$i, w0$i,  t3$i;
--:-:-:-:1      FADD w3$i, w3$i,  m5$i;
</ORDERED>
        };
    }
    return $out;
+]
// P0..P3 = low four bits of preds when k < K, otherwise all false. preds is
// then rotated right by 4 so the next group of four bits is in the low
// position for the following R2P.
--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
</SCHEDULE_BLOCK>

// Generated: second transform stage. Each row i of the 4x6 intermediate
// (w<i>0..w<i>5) is reduced to four outputs s<i>0..s<i>3, then bias and relu
// are applied if enabled.
<SCHEDULE_BLOCK>
[+
    # Per row i, with the same coefficients as the first stage:
    #   r0 = w1 + w2, r1 = w1 - w2, r2 = w3 - w4, r3 = w3 + w4
    #   s0 = r0 + w0 + r3
    #   s1 = r1 * trans2 + r2 * 1.5
    #   s2 = r0 * trans3 + r3 * 2.25
    #   s3 = r1 * trans1 + r2 * 3.375 + w5
    # $bias: add the loaded bias to all four outputs; the first add waits
    #        with mask 10, i.e. on barrier 5 used by the bias load.
    # $relu: s = max(s, 0), written as FMNMX with the !PT selector.
    my $out;
    our ($convert_out, $bias, $relu, $trans1, $trans2, $trans3);
    foreach my $i (0 .. 3)
    {
        $out .= qq{
--:-:-:-:1      FADD r${i}0, w${i}1,  w${i}2;
--:-:-:-:1      FADD r${i}1, w${i}1, -w${i}2;
--:-:-:-:1      FADD r${i}2, w${i}3, -w${i}4;
--:-:-:-:1      FADD r${i}3, w${i}3,  w${i}4;
--:-:-:-:1      FADD s${i}0, r${i}0,  w${i}0;
--:-:-:-:1      FMUL32I s${i}3, r${i}1, $trans1;
--:-:-:-:1      FMUL32I s${i}1, r${i}1, $trans2;
--:-:-:-:1      FMUL32I temp,   r${i}0, $trans3;
--:-:-:-:1      FFMA s${i}3, r${i}2,  3.375, s${i}3;
--:-:-:-:1      FFMA s${i}1, r${i}2,  1.500, s${i}1;
--:-:-:-:1      FFMA s${i}2, r${i}3,  2.250, temp;
--:-:-:-:1      FADD s${i}0, s${i}0,  r${i}3;
--:-:-:-:1      FADD s${i}3, s${i}3,  w${i}5;
        };
        if ($bias)
        {
            $out .= qq{
10:-:-:-:1      FADD s${i}0, s${i}0, bias;
--:-:-:-:1      FADD s${i}1, s${i}1, bias;
--:-:-:-:1      FADD s${i}2, s${i}2, bias;
--:-:-:-:1      FADD s${i}3, s${i}3, bias;};
        }
        if ($relu)
        {
            $out .= qq{
--:-:-:-:1      FMNMX s${i}0, s${i}0, RZ, !PT;
--:-:-:-:1      FMNMX s${i}1, s${i}1, RZ, !PT;
--:-:-:-:1      FMNMX s${i}2, s${i}2, RZ, !PT;
--:-:-:-:1      FMNMX s${i}3, s${i}3, RZ, !PT;};
        }
    }
    return $out;
+]
</SCHEDULE_BLOCK>
// Generated only when the prelu option is set:
// s = max(s, 0) + param_beta * min(s, 0) for all sixteen outputs.
<SCHEDULE_BLOCK>
[+
    # b00..b03 hold the max(s, 0) parts and b10..b13 the min(s, 0) parts.
    # The same eight temporaries are reused for every row i.
    our $prelu; my $out;
    if ($prelu)
    {
        foreach my $i (0 .. 3)
        {
            $out .= qq{
// maximum(x, 0) + beta * minimum(0, x)
--:-:-:-:1      FMNMX b00, s${i}0, RZ, !PT;
--:-:-:-:1      FMNMX b01, s${i}1, RZ, !PT;
--:-:-:-:1      FMNMX b02, s${i}2, RZ, !PT;
--:-:-:-:1      FMNMX b03, s${i}3, RZ, !PT;

--:-:-:-:1      FMNMX b10, s${i}0, RZ, PT;
--:-:-:-:1      FMNMX b11, s${i}1, RZ, PT;
--:-:-:-:1      FMNMX b12, s${i}2, RZ, PT;
--:-:-:-:1      FMNMX b13, s${i}3, RZ, PT;

--:-:-:-:1      FFMA s${i}0, b10, param_beta, b00;
--:-:-:-:1      FFMA s${i}1, b11, param_beta, b01;
--:-:-:-:1      FFMA s${i}2, b12, param_beta, b02;
--:-:-:-:1      FFMA s${i}3, b13, param_beta, b03;
            };
        }
    }
    return $out;
+]
</SCHEDULE_BLOCK>
// Generated only when beta, brelu or bprelu is set: load a 4x4 tile b<i><j>
// from param_X at the same offsets the output will be stored to.
[+
    # Out = param_X + (offsetO << $dshift). Row i is loaded under P0..P3 into
    # b<i>0..b<i>3 (zero where the predicate is false), and row i signals
    # barrier i+1 on its last load.
    # Between rows the next four bits of preds are moved into P0..P3 and
    # preds is rotated: right by 4 twice, then left by 12, which restores the
    # rotation it had on entry to this block.
    # $preds (beta only) reloads P0..P3 and rotates right by 4 once more, so
    # the state matches what the preceding schedule block left. In the
    # brelu/bprelu case the equivalent reload is emitted after the masking code.
    # With $convert_out the loaded values are converted from F16 to F32 in
    # place, each row waiting on and re-signalling its own barrier.
    our ($beta, $brelu, $bprelu, $dtype, $dsize, $dshift, $convert_out, $Q, $N);
    my $out;
    if ($beta || $brelu || $bprelu)
    {
        my $preds = $beta ? q{
--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;
--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;
        } : '';

        $out .= qq{
<SCHEDULE_BLOCK>
--:-:-:-:1      LEA      Out0.CC, offsetO, param_X[0],     $dshift;
--:-:-:-:1      LEA.HI.X Out1,    offsetO, param_X[1], RZ, $dshift;

--:-:-:-:1  \@P0 LDG.E.CG.$dtype b00, [Out + ${dsize}x<0*$Q*$N + 0*$N>];
--:-:-:-:1  \@P1 LDG.E.CG.$dtype b01, [Out + ${dsize}x<0*$Q*$N + 1*$N>];
--:-:-:-:1  \@P2 LDG.E.CG.$dtype b02, [Out + ${dsize}x<0*$Q*$N + 2*$N>];
--:-:1:-:1  \@P3 LDG.E.CG.$dtype b03, [Out + ${dsize}x<0*$Q*$N + 3*$N>];
--:-:-:-:1 \@!P0 MOV b00, RZ;
--:-:-:-:1 \@!P1 MOV b01, RZ;
--:-:-:-:1 \@!P2 MOV b02, RZ;
--:-:-:-:1 \@!P3 MOV b03, RZ;
--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;

--:-:-:-:1  \@P0 LDG.E.CG.$dtype b10, [Out + ${dsize}x<1*$Q*$N + 0*$N>];
--:-:-:-:1  \@P1 LDG.E.CG.$dtype b11, [Out + ${dsize}x<1*$Q*$N + 1*$N>];
--:-:-:-:1  \@P2 LDG.E.CG.$dtype b12, [Out + ${dsize}x<1*$Q*$N + 2*$N>];
--:-:2:-:1  \@P3 LDG.E.CG.$dtype b13, [Out + ${dsize}x<1*$Q*$N + 3*$N>];
--:-:-:-:1 \@!P0 MOV b10, RZ;
--:-:-:-:1 \@!P1 MOV b11, RZ;
--:-:-:-:1 \@!P2 MOV b12, RZ;
--:-:-:-:1 \@!P3 MOV b13, RZ;
--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;

--:-:-:-:1  \@P0 LDG.E.CG.$dtype b20, [Out + ${dsize}x<2*$Q*$N + 0*$N>];
--:-:-:-:1  \@P1 LDG.E.CG.$dtype b21, [Out + ${dsize}x<2*$Q*$N + 1*$N>];
--:-:-:-:1  \@P2 LDG.E.CG.$dtype b22, [Out + ${dsize}x<2*$Q*$N + 2*$N>];
--:-:3:-:1  \@P3 LDG.E.CG.$dtype b23, [Out + ${dsize}x<2*$Q*$N + 3*$N>];
--:-:-:-:1 \@!P0 MOV b20, RZ;
--:-:-:-:1 \@!P1 MOV b21, RZ;
--:-:-:-:1 \@!P2 MOV b22, RZ;
--:-:-:-:1 \@!P3 MOV b23, RZ;
--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
--:-:-:-:1      SHF.L.U64 preds, preds, 12, preds;

--:-:-:-:1  \@P0 LDG.E.CG.$dtype b30, [Out + ${dsize}x<3*$Q*$N + 0*$N>];
--:-:-:-:1  \@P1 LDG.E.CG.$dtype b31, [Out + ${dsize}x<3*$Q*$N + 1*$N>];
--:-:-:-:1  \@P2 LDG.E.CG.$dtype b32, [Out + ${dsize}x<3*$Q*$N + 2*$N>];
--:-:4:-:1  \@P3 LDG.E.CG.$dtype b33, [Out + ${dsize}x<3*$Q*$N + 3*$N>];
--:-:-:-:1 \@!P0 MOV b30, RZ;
--:-:-:-:1 \@!P1 MOV b31, RZ;
--:-:-:-:1 \@!P2 MOV b32, RZ;
--:-:-:-:1 \@!P3 MOV b33, RZ;$preds
</SCHEDULE_BLOCK>};

        if ($convert_out)
        {
            $out .= q{
01:-:-:-:1      F2F.F32.F16 b00, b00;
--:-:-:-:1      F2F.F32.F16 b01, b01;
--:-:-:-:1      F2F.F32.F16 b02, b02;
--:-:1:-:1      F2F.F32.F16 b03, b03;
02:-:-:-:1      F2F.F32.F16 b10, b10;
--:-:-:-:1      F2F.F32.F16 b11, b11;
--:-:-:-:1      F2F.F32.F16 b12, b12;
--:-:2:-:1      F2F.F32.F16 b13, b13;
04:-:-:-:1      F2F.F32.F16 b20, b20;
--:-:-:-:1      F2F.F32.F16 b21, b21;
--:-:-:-:1      F2F.F32.F16 b22, b22;
--:-:3:-:1      F2F.F32.F16 b23, b23;
08:-:-:-:1      F2F.F32.F16 b30, b30;
--:-:-:-:1      F2F.F32.F16 b31, b31;
--:-:-:-:1      F2F.F32.F16 b32, b32;
--:-:4:-:1      F2F.F32.F16 b33, b33;};
        }
    }
    return $out;
+]

// Generated: combine the loaded b values with the outputs s.
//   beta   : s += b * param_beta
//   brelu  : s = 0 wherever b is not greater than 0
//   bprelu : s *= (b > 0 ? 1 : 0) + param_beta * (b < 0 ? 1 : 0)
// The first instruction of each row waits on that row's barrier (masks 01, 02, 04, 08).
<SCHEDULE_BLOCK>
[+
    # beta accumulate, emitted only when $beta is set.
    our $beta; return $beta ? q{
01:-:-:-:1      FFMA s00, b00, param_beta, s00;
--:-:-:-:1      FFMA s01, b01, param_beta, s01;
--:-:-:-:1      FFMA s02, b02, param_beta, s02;
--:-:-:-:1      FFMA s03, b03, param_beta, s03;
02:-:-:-:1      FFMA s10, b10, param_beta, s10;
--:-:-:-:1      FFMA s11, b11, param_beta, s11;
--:-:-:-:1      FFMA s12, b12, param_beta, s12;
--:-:-:-:1      FFMA s13, b13, param_beta, s13;
04:-:-:-:1      FFMA s20, b20, param_beta, s20;
--:-:-:-:1      FFMA s21, b21, param_beta, s21;
--:-:-:-:1      FFMA s22, b22, param_beta, s22;
--:-:-:-:1      FFMA s23, b23, param_beta, s23;
08:-:-:-:1      FFMA s30, b30, param_beta, s30;
--:-:-:-:1      FFMA s31, b31, param_beta, s31;
--:-:-:-:1      FFMA s32, b32, param_beta, s32;
--:-:-:-:1      FFMA s33, b33, param_beta, s33;} : '';
+]
[+
    # brelu / bprelu masking. Both variants overwrite P0..P3 with float
    # compares, so the store predicates are reloaded from preds at the end
    # (followed by the usual rotate right by 4).
    # The bprelu variant uses the register "one" and scratch xx0..xx3, and
    # overwrites b<i>0..b<i>3 with the per-element scale factor.
    our ($brelu, $bprelu); my $out;
    if ($brelu || $bprelu)
    {
        foreach my $i (0 .. 3)
        {
            my $w = sprintf "%02x", 1 << $i;
            $out .= $brelu ? qq{
//delta *= (x > 0)
$w:-:-:-:1      FSETP.GT.AND P0, PT, b${i}0, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P1, PT, b${i}1, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P2, PT, b${i}2, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P3, PT, b${i}3, RZ, PT;
--:-:-:-:1 \@!P0 MOV s${i}0, RZ;
--:-:-:-:1 \@!P1 MOV s${i}1, RZ;
--:-:-:-:1 \@!P2 MOV s${i}2, RZ;
--:-:-:-:1 \@!P3 MOV s${i}3, RZ;
            } : qq{
//delta *= ((x > 0) + slope * (x < 0))
$w:-:-:-:1      FSETP.GT.AND P0, PT, b${i}0, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P1, PT, b${i}1, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P2, PT, b${i}2, RZ, PT;
--:-:-:-:1      FSETP.GT.AND P3, PT, b${i}3, RZ, PT;
--:-:-:-:1      SEL xx0, one, RZ, P0;
--:-:-:-:1      SEL xx1, one, RZ, P1;
--:-:-:-:1      SEL xx2, one, RZ, P2;
--:-:-:-:1      SEL xx3, one, RZ, P3;
--:-:-:-:1      FSETP.LT.AND P0, PT, b${i}0, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P1, PT, b${i}1, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P2, PT, b${i}2, RZ, PT;
--:-:-:-:1      FSETP.LT.AND P3, PT, b${i}3, RZ, PT;
--:-:-:-:1      SEL b${i}0, one, RZ, P0;
--:-:-:-:1      SEL b${i}1, one, RZ, P1;
--:-:-:-:1      SEL b${i}2, one, RZ, P2;
--:-:-:-:1      SEL b${i}3, one, RZ, P3;
--:-:-:-:1      FFMA b${i}0, b${i}0, param_beta, xx0;
--:-:-:-:1      FFMA b${i}1, b${i}1, param_beta, xx1;
--:-:-:-:1      FFMA b${i}2, b${i}2, param_beta, xx2;
--:-:-:-:1      FFMA b${i}3, b${i}3, param_beta, xx3;
--:-:-:-:1      FMUL s${i}0, s${i}0, b${i}0;
--:-:-:-:1      FMUL s${i}1, s${i}1, b${i}1;
--:-:-:-:1      FMUL s${i}2, s${i}2, b${i}2;
--:-:-:-:1      FMUL s${i}3, s${i}3, b${i}3;
            };
        }
        $out .= q{
--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
--:-:-:-:5 @!P5 R2P PR, RZ, 0x0f;
--:-:-:-:5      SHF.R.U64 preds, preds, 4, preds;
};
    }
    return $out;
+]
</SCHEDULE_BLOCK>
// Generated only when bsum is set: per-thread sum of the in-range outputs and
// the address the reduced sum is written to.
[+
    # The register "bias" is used as a temporary for
    #   k * param_gridPQN + bsum_offset
    # and Sum = param_S + 4 * that value.
    # sum0..sum3 accumulate columns 0..3 of s under P0..P3. After each row
    # the next four bits of preds are loaded into P0..P3 and preds is rotated:
    # right by 4 for rows 0, 1 and 3, left by 12 for row 2. On exit P0..P3 and
    # preds are back in the state they had on entry.
    # The four partial sums are finally folded into sum0.
    our $bsum; my $out;
    if ($bsum)
    {
        $out = q{
<SCHEDULE_BLOCK>
--:-:-:-:1      XMAD.LO2C bias, k, param_gridPQN, bsum_offset;
--:-:-:-:1      LEA      Sum0.CC, bias, param_S[0],     2;
--:-:-:-:1      LEA.HI.X Sum1,    bias, param_S[1], RZ, 2;
--:-:-:-:1      MOV  sum0, RZ;
--:-:-:-:1      MOV  sum1, RZ;
--:-:-:-:1      MOV  sum2, RZ;
--:-:-:-:1      MOV  sum3, RZ;};
        foreach my $i (0 .. 3)
        {
            my ($dir, $amt) = $i == 2 ? ('L','12') : ('R','4');
            $out .= qq{
--:-:-:-:1  \@P0 FADD sum0, sum0, s${i}0;
--:-:-:-:1  \@P1 FADD sum1, sum1, s${i}1;
--:-:-:-:1  \@P2 FADD sum2, sum2, s${i}2;
--:-:-:-:1  \@P3 FADD sum3, sum3, s${i}3;
--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
--:-:-:-:1      SHF.$dir.U64 preds, preds, $amt, preds;};
        }
        $out .= q{
--:-:-:-:1      FADD sum0, sum0, sum1;
--:-:-:-:1      FADD sum2, sum2, sum3;
--:-:-:-:1      FADD sum0, sum0, sum2;
</SCHEDULE_BLOCK>};
    }
    return $out;
+]
// Generated only when convert_out is set: convert the sixteen outputs from
// F32 to F16 in place before they are stored.
[+
    # The last conversion of row i signals barrier i+1; the first store of
    # each row in the store code below waits with the matching mask.
    our $convert_out; return $convert_out ? q{
--:-:-:-:1      F2F.F16.F32 s00, s00;
--:-:-:-:1      F2F.F16.F32 s01, s01;
--:-:-:-:1      F2F.F16.F32 s02, s02;
--:-:1:-:1      F2F.F16.F32 s03, s03;
--:-:-:-:1      F2F.F16.F32 s10, s10;
--:-:-:-:1      F2F.F16.F32 s11, s11;
--:-:-:-:1      F2F.F16.F32 s12, s12;
--:-:2:-:1      F2F.F16.F32 s13, s13;
--:-:-:-:1      F2F.F16.F32 s20, s20;
--:-:-:-:1      F2F.F16.F32 s21, s21;
--:-:-:-:1      F2F.F16.F32 s22, s22;
--:-:3:-:1      F2F.F16.F32 s23, s23;
--:-:-:-:1      F2F.F16.F32 s30, s30;
--:-:-:-:1      F2F.F16.F32 s31, s31;
--:-:-:-:1      F2F.F16.F32 s32, s32;
--:-:4:-:1      F2F.F16.F32 s33, s33;} : '';
+]

// Generated: store the 4x4 result tile to param_O and return to the caller.
[+
    # Out = param_O + (offsetO << $dshift). Element s<i><j> is stored under
    # predicate Pj at element offset i*Q*N + j*N. Between rows the next four
    # bits of preds are loaded into P0..P3 and preds is rotated: right by 4
    # twice, then left by 12.
    #
    # bsum variant: hand-scheduled (no schedule block). The stores are
    # interleaved with a five-step SHFL.BFLY reduction of sum0 using lane
    # masks 1, 2, 4, 8, 16. P5 is then narrowed to (k < K and tid_31 == 0)
    # and that thread stores sum0 to Sum.
    #
    # plain variant: the same stores inside an ordered schedule block.
    #
    # In both variants the last store of the tile sets the second control
    # field to 1, and the bsum store of sum0 sets it to 5. The ISETP at the
    # top of OUTPUT_TRANSFORM waits with mask 11; presumably that wait covers
    # these two barriers on the next call - confirm against the maxas
    # control-code format.
    our ($bsum, $dtype, $dsize, $dshift, $Q, $N);
    return $bsum ? qq{
--:-:-:Y:6      LEA      Out0.CC, offsetO, param_O[0],     $dshift;
--:-:-:-:0      LEA.HI.X Out1,    offsetO, param_O[1], RZ, $dshift;
--:-:5:-:2      SHFL.BFLY PT, sum1, sum0,  1, 0x1f;
01:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00;
--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01;
--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02;
--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03;
--:-:-:-:2  \@P5 R2P PR, preds, 0x0f;
--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f;

10:-:-:-:4      FADD sum0, sum1, sum0;
--:-:-:-:0      SHF.R.U64 preds, preds, 4, preds;
--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 2, 0x1f;

02:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10;
--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11;
--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12;
--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13;
--:-:-:-:2  \@P5 R2P PR, preds, 0x0f;
--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f;

10:-:-:-:4      FADD sum0, sum1, sum0;
--:-:-:-:0      SHF.R.U64 preds, preds, 4, preds;
--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 4, 0x1f;

04:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20;
--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21;
--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 2*$N>], s22;
--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23;
--:-:-:-:2  \@P5 R2P PR, preds, 0x0f;
--:-:-:Y:7 \@!P5 R2P PR, RZ, 0x0f;

10:-:-:-:4      FADD sum0, sum1, sum0;
--:-:-:-:0      SHF.L.U64 preds, preds, 12, preds;
--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 8, 0x1f;

08:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30;
--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31;
--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32;
--:1:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33;

10:-:-:-:4      FADD sum0, sum1, sum0;
--:-:-:-:0      PSETP.AND.AND P5, PT, P5, P6, PT; // k < K && tid31 == 0
--:-:5:-:2      SHFL.BFLY PT, sum1, sum0, 16, 0x1f;
10:-:-:-:2      FADD sum0, sum1, sum0;
--:5:-:-:1  \@P5 STG.E.CG [Sum], sum0;
    } : qq{
<SCHEDULE_BLOCK>
<ORDERED>
--:-:-:-:1      LEA      Out0.CC, offsetO, param_O[0],     $dshift;
--:-:-:-:1      LEA.HI.X Out1,    offsetO, param_O[1], RZ, $dshift;

01:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 0*$N>], s00;
--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 1*$N>], s01;
--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 2*$N>], s02;
--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<0*$Q*$N + 3*$N>], s03;

--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;

02:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 0*$N>], s10;
--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 1*$N>], s11;
--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 2*$N>], s12;
--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<1*$Q*$N + 3*$N>], s13;

--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
--:-:-:-:1      SHF.R.U64 preds, preds, 4, preds;

04:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 0*$N>], s20;
--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 1*$N>], s21;
--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 2*$N>], s22;
--:-:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<2*$Q*$N + 3*$N>], s23;

--:-:-:-:1  \@P5 R2P PR, preds, 0x0f;
--:-:-:-:1 \@!P5 R2P PR, RZ,    0x0f;
--:-:-:-:1      SHF.L.U64 preds, preds, 12, preds;

08:-:-:-:1  \@P0 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 0*$N>], s30;
--:-:-:-:1  \@P1 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 1*$N>], s31;
--:-:-:-:1  \@P2 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 2*$N>], s32;
--:1:-:-:1  \@P3 STG.E.CG.$dtype [Out + ${dsize}x<3*$Q*$N + 3*$N>], s33;
</ORDERED>
</SCHEDULE_BLOCK>
    };
+]

--:-:-:-:5      RET;
